source("https://bioconductor.org/biocLite.R")
biocLite("AnnotationHub")
library(AnnotationHub)
library(biomaRt)
library(tidyverse) # ggplot2, dplyr, tidyr, readr, purrr, tibble
library(stringr) # string manipulation
library(data.table)
proj_dir <- "/mnt/hd2/Dropbox (Partners HealthCare)/projects/2017_06_haystack/get_data_script"
#proj_dir <- "/data/pinello/PROJECTS/2017_06_HAYSTACK/haystack_bio/get_data_script"
sample_names_dir <- file.path(proj_dir, "sample_names")
#dir.create(sample_names_dir)
output_dir <- file.path(proj_dir,"files_source_urls")
data_dir <- "/data/molpath/common_data/roadmap"
bigwig_dir <- file.path(data_dir, "bigwig")
rpkm_output_dir <- file.path(data_dir, "gene_expression" ,"rpkm")
#dir.create(rpkm_output_dir)
download.file("http://egg2.wustl.edu/roadmap/data/byDataType/rna/expression/EG.name.txt",
"EG.name.txt")
eg_names <- fread(input = "EG.name.txt",
header = FALSE,
col.names = c("epigenome",
"celltypes")) %>%
filter(epigenome != "E000" )
eg_names
ah <- AnnotationHub()
# Searching for epigenomes in the roadmap database
epiFiles <- query(ah, c("EpigenomeRoadMap", "^E"))
epiFiles <- query(epiFiles , c( "H3K27ac.fc.signal.bigwig",
"H3K27me3.fc.signal.bigwig",
"H3K4me3.fc.signal.bigwig",
"DNase.fc.signal.bigwig",
"WGBS_FractionalMethylation.bigwig"),
pattern.op= `|`)
epiFiles
## AnnotationHub with 442 records
## # snapshotDate(): 2017-04-25
## # $dataprovider: BroadInstitute
## # $species: Homo sapiens
## # $rdataclass: BigWigFile
## # additional mcols(): taxonomyid, genome, description,
## # coordinate_1_based, maintainer, rdatadateadded, preparerclass,
## # tags, rdatapath, sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["AH32003"]]'
##
## title
## AH32003 | E001-H3K4me3.fc.signal.bigwig
## AH32006 | E001-H3K27me3.fc.signal.bigwig
## AH32009 | E002-H3K4me3.fc.signal.bigwig
## AH32012 | E002-H3K27me3.fc.signal.bigwig
## AH32014 | E003-DNase.fc.signal.bigwig
## ... ...
## AH49524 | E105_WGBS_FractionalMethylation.bigwig
## AH49525 | E106_WGBS_FractionalMethylation.bigwig
## AH49526 | E109_WGBS_FractionalMethylation.bigwig
## AH49527 | E112_WGBS_FractionalMethylation.bigwig
## AH49528 | E113_WGBS_FractionalMethylation.bigwig
RMEP_dt <- data.table(title= epiFiles$title,
sourceurl= epiFiles$sourceurl) %>%
bind_cols(do(., data.frame(
str_split(.$title,
"-|.fc|_WGBS_",
simplify = TRUE)[,1:2],
stringsAsFactors = FALSE))) %>%
select(-title) %>%
rename( epigenome= "X1",
mark= "X2") %>%
mutate(mark=replace(mark,
mark== "FractionalMethylation.bigwig",
"methylation"))
marks <- RMEP_dt %>%
select(mark) %>%
unique %>%
unlist
RMEP_dt
epifiles_dt<- RMEP_dt %>%
dcast( epigenome ~ mark )
## Using 'mark' as value column. Use 'value.var' to override
epifiles_dt
Epigenomes that have gene expression data
epifiles_dt <- left_join(eg_names,
epifiles_dt,
by='epigenome')
epifiles_dt
Number of epigenomes with gene expression data that have 1, 2, 3, or 4 of the marks (DNase, methylation, H3K27ac, H3K27me3 )
epifiles_dt$marks_num <- epifiles_dt %>%
select(-epigenome, -celltypes) %>%
is.na %>%
`!` %>%
rowSums
epifiles_dt
epifiles_dt %>%
select(marks_num) %>%
table
## .
## 2 3 4 5
## 1 13 29 13
Epigenomes with all 5 marks
epifiles_dt %>%
filter(marks_num==5)
Epigenomes with 4 or 5 marks
selected_epigenomes <- epifiles_dt %>%
filter(marks_num >=4) %>%
arrange(marks_num) %>%
filter(epigenome!="E056")
selected_epigenomes
for (mark_target in marks){
RMEP_dt %>%
filter(epigenome %in% selected_epigenomes$epigenome) %>%
filter(mark == mark_target ) %>%
select(sourceurl) %>%
fwrite(file= file.path(output_dir, paste0(mark_target,"_urls.txt")),
sep='\t',
col.names = FALSE)
}
ah <- AnnotationHub()
## snapshotDate(): 2017-04-25
RPKM_query <- query(ah, c("EpigenomeRoadMap", "exon.RPKM.pc"))
RPKM_query
## AnnotationHub with 1 record
## # snapshotDate(): 2017-04-25
## # names(): AH49026
## # $dataprovider: BroadInstitute
## # $species: Homo sapiens
## # $rdataclass: data.frame
## # $rdatadateadded: 2015-08-10
## # $title: 57epigenomes.exon.RPKM.pc.gz
## # $description: RPKM expression matrix for protein coding exons
## # $taxonomyid: 9606
## # $genome: hg19
## # $sourcetype: Zip
## # $sourceurl: http://egg2.wustl.edu/roadmap/data/byDataType/rna/express...
## # $sourcesize: 32882572
## # $tags: c("EpigenomeRoadMap", "expression", "quantification",
## # "hg19")
## # retrieve record with 'object[["AH49026"]]'
RPKM_dt <- ah[["AH49026"]]
## loading from cache '/home/rick//.AnnotationHub/55332'
## require("SummarizedExperiment")
RPKM_dt
## class: RangedSummarizedExperiment
## dim: 233480 56
## metadata(0):
## assays(1): ''
## rownames(233480): chr10:100003848-100004654<1
## chr10:100007447-100008748<-1 ... chrY:9384257-9384264<-1
## chrY:9384401-9384693<-1
## rowData names(2): gene_id E000
## colnames(56): E003 E004 ... E127 E128
## colData names(0):
rpkm_genes <- data.frame(gene_id =
rowData(RPKM_dt)$gene_id %>%
as.character,
stringsAsFactors = FALSE) %>%
cbind( . ,
RPKM_dt %>%
assay() %>%
data.frame()
)
rpkm_genes
Define biomart object
mart <- useMart(biomart = "ensembl",
dataset = "hsapiens_gene_ensembl")
#Get gene names
results <- getBM(attributes = c("ensembl_gene_id",
"external_gene_name"),
filters = "ensembl_gene_id",
values = rpkm_genes$gene_id,
mart = mart)
results
rename
results <- results %>%
dplyr::rename(gene_id ="ensembl_gene_id")
results
rpkm_genes <- full_join(rpkm_genes,
results,
by = "gene_id")
rpkm_genes
rpkm_genes_avg <- rpkm_genes %>%
select(-gene_id) %>%
group_by(external_gene_name) %>%
summarise_all(funs(mean))
rpkm_epigenomes <- colnames(rpkm_genes_avg)[-1]
rpkm_genes_avg